# libraries in use

library(tidyverse)
library(dplyr)
library(usmap)
library(ggplot2)
library(maps)
library(mapdata)
library(plotly)

# Benjamin's read and clean function

read_and_clean_county_data <- function(file_path_name, state, outcome) {
  read_csv(file_path_name,
           skip = 8) %>%
  janitor::clean_names() %>%
  select(county, age_adjusted_incidence_rate_rate_note_cases_per_100_000) %>%
  mutate(county = stringr::str_remove(county, "\\(.\\)$"),
          state = state,
         outcome = outcome,
         age_adjusted_incidence_rate =
           as.numeric(age_adjusted_incidence_rate_rate_note_cases_per_100_000)) %>%
  select(state, county, outcome, age_adjusted_incidence_rate)
}

Next, I loaded in the county data sets using this function:

#county-level lung cancer PA data https://statecancerprofiles.cancer.gov/

pa_county_lc = read_and_clean_county_data('./data/lung/pa_lung_data_county.csv', 
                                          state = "PA",
                                          outcome = "lung cancer") %>%
                                          filter(complete.cases(.)) %>%
                                          slice(-c(1, 2))
## New names:
## * `Lower 95% Confidence Interval` -> `Lower 95% Confidence Interval...5`
## * `Upper 95% Confidence Interval` -> `Upper 95% Confidence Interval...6`
## * `Lower 95% Confidence Interval` -> `Lower 95% Confidence Interval...13`
## * `Upper 95% Confidence Interval` -> `Upper 95% Confidence Interval...14`
## Rows: 87 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): County, FIPS, Met Healthy People Objective of ***?, CI*Rank([rank n...
## dbl (7): Age-Adjusted Incidence Rate([rate note]) - cases per 100,000, Lower...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Warning: One or more parsing issues, see `problems()` for details
#county-level lung cancer OH data https://statecancerprofiles.cancer.gov/

oh_county_lc = read_and_clean_county_data('./data/lung/oh_lung_data_county.csv', 
                                          state = "OH", 
                                          outcome = "lung cancer") %>%
                                          filter(complete.cases(.)) %>%
                                          slice(-c(1, 2))
## New names:
## * `Lower 95% Confidence Interval` -> `Lower 95% Confidence Interval...5`
## * `Upper 95% Confidence Interval` -> `Upper 95% Confidence Interval...6`
## * `Lower 95% Confidence Interval` -> `Lower 95% Confidence Interval...13`
## * `Upper 95% Confidence Interval` -> `Upper 95% Confidence Interval...14`
## Rows: 108 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): County, FIPS, Met Healthy People Objective of ***?, CI*Rank([rank n...
## dbl (7): Age-Adjusted Incidence Rate([rate note]) - cases per 100,000, Lower...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Warning: One or more parsing issues, see `problems()` for details
#county-level lung cancer NY data https://statecancerprofiles.cancer.gov/

ny_county_lc = read_and_clean_county_data('./data/lung/ny_lung_data_county.csv', 
                                          state = "NY", 
                                          outcome = "lung cancer") %>%
                                          filter(complete.cases(.)) %>%
                                          slice(-c(1, 2))
## New names:
## * `Lower 95% Confidence Interval` -> `Lower 95% Confidence Interval...5`
## * `Upper 95% Confidence Interval` -> `Upper 95% Confidence Interval...6`
## * `Lower 95% Confidence Interval` -> `Lower 95% Confidence Interval...13`
## * `Upper 95% Confidence Interval` -> `Upper 95% Confidence Interval...14`
## Rows: 82 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): County, FIPS, Met Healthy People Objective of ***?, CI*Rank([rank n...
## dbl (7): Age-Adjusted Incidence Rate([rate note]) - cases per 100,000, Lower...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Warning: One or more parsing issues, see `problems()` for details

Finally, I loaded in the state-level lung cancer data

#state-level lung cancer PA data https://www.phaim1.health.pa.gov/EDD/

pa_cancer = read.csv('./data/lung/pa_state_lung.csv', sep = ";", header = T, skip = 3) %>%
  janitor::clean_names() %>%
  select(year, rate_ratio_result) %>%
  map_df(str_replace, pattern = ",", replacement = ".") %>%
    map_df(as.numeric) %>%
    rename(c("age_adjusted_incidence_rate" = "rate_ratio_result")) %>%
  add_column(state = "PA")

#state-level lung cancer OH data https://publicapps.odh.ohio.gov/EDW/DataBrowser/Browse/StateLayoutLockdownCancers

oh_cancer = read.csv('./data/lung/oh_state_lung.csv', sep = ";") %>%
  janitor::clean_names() %>%
  rename(year = cancer_year_year) %>%
  select(year, age_adjusted_rate) %>%
    map_df(str_replace, pattern = ",", replacement = ".") %>%
    map_df(as.numeric) %>%
    rename(c("age_adjusted_incidence_rate" = "age_adjusted_rate")) %>%
  add_column(state = "OH") %>%
  distinct() %>%
  filter(complete.cases(.))
## Warning in .f(.x[[i]], ...): NAs introduced by coercion
#state-level lung cancer NY data https://www.health.ny.gov/statistics/cancer/registry/table2/tb2lungnys.htm

ny_cancer = read.csv("./data/lung/ny_state_lung.csv", skip = 2)[ ,1:3] %>%
  janitor::clean_names() %>%
  rename(year = x) %>%
  select(year, rate_per_100_000_population) %>%
  rename(c("age_adjusted_incidence_rate" = "rate_per_100_000_population")) %>%
  add_column(state = "NY")
# merging county level data for lung cancer

fips_codes = read.csv('./data/fips_codes.csv') %>%
  janitor::clean_names() %>%
  filter(state %in% c("NY", "PA", "OH")) %>%
  rename(c("county" = "name"))



fips_codes$county[fips_codes$county == "St Lawrence"] <- "St. Lawrence"

county_lc <- rbind(pa_county_lc, oh_county_lc, ny_county_lc) %>%
   mutate(county = (gsub('.{7}$', '', county)))

lc_fips <- merge(county_lc, fips_codes, by = c("state", "county"))

# lung cancer incidence maps by county

ny_map <- plot_usmap(data = lc_fips, values = "age_adjusted_incidence_rate", "counties", include = c("NY"), color = "black") +  
    labs(title = "New York Age-Adjusted Lung Cancer Rates, 2014-2018") +  
    scale_fill_continuous(low = "#FFE9C7", high = "#FF0000", 
                          name = "age_adjusted_incidence_rate", label = scales::comma) +  
    theme(plot.background = element_rect(), legend.position = "right")  

oh_map <- plot_usmap(data = lc_fips, values = "age_adjusted_incidence_rate", "counties", include = c("OH"), color = "black") +  
    labs(title = "Ohio Age-Adjusted Lung Cancer Rates, 2014-2018") +  
    scale_fill_continuous(low = "#FFE9C7", high = "#FF0000", 
                          name = "age_adjusted_incidence_rate", label = scales::comma) +  
    theme(plot.background = element_rect(), legend.position = "right")  

pa_map <- plot_usmap(data = lc_fips, values = "age_adjusted_incidence_rate", "counties", include = c("PA"), color = "black") +  
    labs(title = "Pennsylvania Age-Adjusted Lung Cancer Rates, 2014-2018") +  
    scale_fill_continuous(low = "#FFE9C7", high = "#FF0000", 
                          name = "age_adjusted_incidence_rate", label = scales::comma) +  
    theme(plot.background = element_rect(), legend.position = "right")  

ny_oh_pa_lungcancer <- plot_usmap(data = lc_fips, values = "age_adjusted_incidence_rate", "counties", include = c("NY", "OH", "PA"), color = "black") +  
    labs(title = "NY, OH, and PA Age-Adjusted Lung Cancer Rates, 2014-2018") +  
    scale_fill_continuous(low = "#FFE9C7", high = "#FF0000", 
                          name = "age_adjusted_incidence_rate", label = scales::comma) +  
    theme(plot.background = element_rect(), legend.position = "right")  
# merging state level data for lung cancer

state_lc <- rbind(pa_cancer, oh_cancer, ny_cancer)

state_lc_wide = state_lc %>%
  pivot_wider(
    names_from = state,
    values_from = age_adjusted_incidence_rate) %>%
  rename(c("PA_AAIR" = "PA", "NY_AAIR" = "NY", "OH_AAIR" = "OH"))

write_csv(state_lc_wide, "./data/state_lc_wide.csv")
# plotly of county level data

fig <- plot_ly(state_lc_wide, x = ~year)
fig <- fig %>% add_lines(y = ~PA_AAIR, name = "Pennsylvania")
fig <- fig %>% add_lines(y = ~NY_AAIR, name = "New York")
fig <- fig %>% add_lines(y = ~OH_AAIR, name = "Ohio")
fig <- fig %>% layout(
    title = "Lung Cancer Age-Adjusted Incidence Rates",
    xaxis = list(
      rangeselector = list(
        buttons = list(
          list(
            count = 1,
            label = "1 yr",
            step = "year",
            stepmode = "backward"),
          list(
            count = 5,
            label = "5 yr",
            step = "year",
            stepmode = "backward"),
          list(
            count = 10,
            label = "10 yr",
            step = "year",
            stepmode = "backward"),
          list(step = "all"))),

      rangeslider = list(type = "year")),

    yaxis = list(title = "Age-Adjusted Incidence Rate (per 100,000"))

fig
apuv = read.csv('./ap/ap_uv/apuv.csv')

average_med_aqi =
  apuv %>%
  group_by(state, year) %>%
  summarise_at(vars(pm25_med_pred), list(avg_med_aqi = mean))


lc_aqi_data <- merge(average_med_aqi, state_lc, by = c("year", "state"))

average_med_aqi_wide =
  average_med_aqi %>%
  pivot_wider(
    names_from = state,
    values_from = avg_med_aqi) %>%
  rename(c("oh_avg_med_aqi" = "OH", "ny_avg_med_aqi" = "NY", "pa_avg_med_aqi" = "PA")) %>%
  subset(select = -c(ME))


write_csv(lc_aqi_data, "./data/lc_aqi_data.csv")

fig1 <- lc_aqi_data %>%
  plot_ly(
    x = ~age_adjusted_incidence,
    y = ~y,
    frame = ~f,
    type = 'scatter',
    mode = 'markers',
    showlegend = F
  )

fig1 <- lc_aqi_data %>%
  plot_ly(
    x = ~avg_med_aqi, 
    y = ~age_adjusted_incidence_rate, 
    color = ~state, 
    frame = ~year, 
    text = ~state, 
    hoverinfo = "text",
    type = 'scatter',
    mode = 'markers'
  )
fig1 <- fig1 %>% layout(
  title = "Lung Cancer AAIR and Average Median AQI",
    xaxis = list(title = "Average Median AQI", 
      type = "log"
    ),
    yaxis = list(title = "Age Adjusted Incidence Rate")
  )


fig1
ny_fips = lc_fips %>%
  filter(state == "NY")
pa_fips = lc_fips %>%
  filter(state == "PA")
oh_fips = lc_fips %>%
  filter(state == "OH")


library(rjson)
library(plotly)
url <- 'https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json'
counties <- rjson::fromJSON(file = url)



county_map_lc <- plot_ly()
county_map_lc <- county_map_lc %>%
  add_trace(
    type = "choroplethmapbox",
    geojson = counties,
    locations = lc_fips$fips,
    z = lc_fips$age_adjusted_incidence_rate,
    text = lc_fips$county,
    colorscale = "Viridis",
    marker = list(line = list(
      width = 0),
      opacity = 0.5)
    )
county_map_lc <- county_map_lc %>%
  layout(
    title = "Age-Adjusted Lung Cancer Incidence Rates (2014-2018)",
    mapbox = list(
      style = "carto-positron",
      zoom = 4,
      center = list(lon = -77.215135, lat = 41.164818)
    )
  )


county_map_lc
lm_lc_aqi = lm(age_adjusted_incidence_rate ~ avg_med_aqi, data = lc_aqi_data)

summary(lm_lc_aqi)
## 
## Call:
## lm(formula = age_adjusted_incidence_rate ~ avg_med_aqi, data = lc_aqi_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.3391 -2.0145  0.2906  1.6527  5.6890 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  44.6447     1.8196   24.54   <2e-16 ***
## avg_med_aqi   2.3193     0.1776   13.06   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.465 on 46 degrees of freedom
## Multiple R-squared:  0.7875, Adjusted R-squared:  0.7829 
## F-statistic: 170.5 on 1 and 46 DF,  p-value: < 2.2e-16
plot(lc_aqi_data$age_adjusted_incidence_rate, lc_aqi_data$avg_med_aqi)

lm_lc_aqi = lm(age_adjusted_incidence_rate ~ avg_med_aqi, data = lc_aqi_data)

lc_aqi_data %>% 
  plot_ly(x = ~avg_med_aqi) %>% 
  add_markers(y = ~age_adjusted_incidence_rate, name = "AQI") %>% 
  add_lines(x = ~avg_med_aqi, y = fitted(lm_lc_aqi), name = "Regression Fit")
write_csv(lc_fips, "./data/lung/lc_fips.csv")

write_csv(county_lc, "./data/lung/county_lc.csv")

state_lc_data = state_lc %>%
  mutate(outcome = "lung cancer")


write_csv(state_lc_data, "./data/lung/state_lc_data.csv")